Variables:
Risk
Money
Security
Good time Help Success Proper Environment Tradition Creativity
Friends important Family important Leisure time Happiness Health (subjective) Satisfaction Freedom
Sex Age Country Wave Marital status Children Employment Education
library(data.table)
library(tidyr)
# ---- Wave 5: read the raw data and keep the variables of interest ----
# Read the serialized Wave 5 object from disk.
WV5_data <- readRDS("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds")
# Work on a plain data.frame version of the loaded object.
WV5_data_df <- as.data.frame(WV5_data)
# Peek at the first rows of the first five columns.
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Lookup table: readable column name = original Wave 5 variable code.
# Its order is also the column order of the reduced data set below.
wv5_lookup <- c(
  sex = "V235",
  age = "V237",
  country = "V2",
  wave = "V1",
  family_important = "V4",
  friends_important = "V5",
  leisure_time = "V6",
  happiness = "V10",
  health = "V11",
  satisfaction = "V22",
  freedom = "V46",
  marital_status = "V55",
  children = "V56",
  creativity = "V80",
  money = "V81",
  security = "V82",
  goodtime = "V83",
  help = "V84",
  success = "V85",
  risk = "V86",
  proper = "V87",
  environment = "V88",
  tradition = "V89",
  employment = "V241",
  education = "V238"
)
# Rename the coded columns; all other columns are kept unchanged for now.
WV5_data <- rename(WV5_data_df, all_of(wv5_lookup))
WV5_data
# Keep only the renamed variables of interest.
WV5_data <- select(WV5_data, all_of(names(wv5_lookup)))
WV5_data
# Decode the numeric country codes of Wave 5 into country names.
# The lookup file is read without a header row; its two columns are named
# "code" and "name" right after reading.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# For every row, find the position of its country code in the lookup table
# and pick the name stored at that position (NA when the code is not listed).
wv5_code_pos <- match(WV5_data$country, countrynames$code)
WV5_data$country_lab <- countrynames$name[wv5_code_pos]
# Number of rows per decoded country name.
table(WV5_data$country_lab)
Andorra Argentina Australia
1003 1002 1421
Brazil Bulgaria Burkina Faso
1500 1001 1534
Canada Chile China
2164 1000 1991
Colombia Cyprus (G) Egypt
3025 1050 3051
Ethiopia Finland France
1500 1014 1001
Georgia Germany Ghana
1500 2064 1534
Great Britain Guatemala Hong Kong
1041 1000 1252
Hungary India Indonesia
1007 2001 2015
Iran Iraq Italy
2667 2701 1012
Japan Jordan Malaysia
1096 1200 1201
Mali Mexico Moldova
1534 1560 1046
Morocco Netherlands New Zealand
1200 1050 954
Norway Peru Poland
1025 1500 1000
Romania Russia Rwanda
1776 2033 1507
Slovenia South Africa South Korea
1037 2988 1200
Spain Sweden Switzerland
1200 1003 1241
Taiwan Thailand Trinidad and Tobago
1227 1534 1002
Turkey Ukraine United States
1346 1000 1249
Uruguay Viet Nam Zambia
1000 1495 1500
# Print the Wave 5 data, which now also carries the country_lab column.
WV5_data
NA
NA
# Read dataset (Wave 6).
# load() restores the saved objects into the workspace and returns only their
# names as a character vector. Its return value is therefore NOT assigned to
# WV6_data (the previous assignment stored that name vector and was
# overwritten on the very next line).
load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
# The .rdata file provides the data under the name WV6_Data_R_v20201117;
# copy it to the shorter name used in the rest of the script.
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
```{r}
#rename variables
# Lookup table: readable column name = original Wave 6 variable code.
# Its order is also the column order of the reduced data set below.
wv6_lookup <- c(
  sex = "V240",
  age = "V242",
  country = "V2",
  wave = "V1",
  marital_status = "V57",
  children = "V58",
  employment = "V229",
  education = "V248",
  risk = "V76",
  money = "V71",
  security = "V72",
  goodtime = "V73",
  help = "V74B",
  success = "V75",
  proper = "V77",
  environment = "V78",
  tradition = "V79",
  creativity = "V70",
  family_important = "V4",
  friends_important = "V5",
  leisure_time = "V6",
  happiness = "V10",
  health = "V11",
  satisfaction = "V23",
  freedom = "V55"
)
# Rename the coded columns; all other columns are kept unchanged for now.
WV6_data <- rename(WV6_data, all_of(wv6_lookup))
# Keep only the renamed variables of interest.
WV6_data <- select(WV6_data, all_of(names(wv6_lookup)))
WV6_data
NA
# Decode the numeric country codes of Wave 6 into country names.
# The lookup file is read without a header row; its two columns are named
# "code" and "name" right after reading.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# For every row, find the position of its country code in the lookup table
# and pick the name stored at that position (NA when the code is not listed).
wv6_code_pos <- match(WV6_data$country, countrynames$code)
WV6_data$country_lab <- countrynames$name[wv6_code_pos]
# Number of rows per decoded country name.
table(WV6_data$country_lab)
Algeria Argentina Armenia
1200 1030 1100
Australia Azerbaijan Belarus
1477 1002 1535
Brazil Chile China
1486 1000 2300
Colombia Cyprus (G) Ecuador
1512 1000 1202
Egypt Estonia Georgia
1523 1533 1202
Germany Ghana Haiti
2046 1552 1996
Hong Kong India Iraq
1000 4078 1200
Japan Jordan Kazakhstan
2443 1200 1500
Kuwait Kyrgyzstan Lebanon
1303 1500 1200
Libya Malaysia Mexico
2131 1300 2000
Morocco Netherlands New Zealand
1200 1902 841
Nigeria Pakistan Palestine
1759 1200 1000
Peru Philippines Poland
1210 1200 966
Qatar Romania Russia
1060 1503 2500
Rwanda Singapore Slovenia
1527 1972 1069
South Africa South Korea Spain
3531 1200 1189
Sweden Taiwan Thailand
1206 1238 1200
Trinidad and Tobago Tunisia Turkey
999 1205 1605
Ukraine United States Uruguay
1500 2232 1000
Uzbekistan Yemen Zimbabwe
1500 1000 1500
WV6_data
# Combine the two data sets: show both inputs, then stack Wave 5 on top of
# Wave 6 (rbind matches data-frame columns by name).
WV5_data
WV6_data
data <- rbind(WV5_data, WV6_data)
data
# Number of distinct country labels in the combined data.
country_labels <- unique(data$country_lab)
length(country_labels)
[1] 80
# Number of participants = number of rows of the combined data.
dim(data)[1]
[1] 173540
# Exclusion of participants.
# Keep only rows with a positive answer on every analysis variable (children
# may be 0). Negative codes such as -5, -2 and -1 occur in the data and are
# treated as invalid here; rows where a condition evaluates to NA are dropped
# by subset() as well.
#
# All conditions must be joined with `&` inside ONE expression. The previous
# version had a comma before the proper/environment/tradition conditions,
# which made subset() take them as its `select` (column) argument, so those
# three filters were silently never applied. Duplicate conditions were removed.
# NOTE(review): row counts reported further down will change with this fix.
data <- subset(
  data,
  risk > 0 & sex > 0 & age > 0 & education > 0 & employment > 0 &
    marital_status > 0 & children >= 0 &
    family_important > 0 & friends_important > 0 & leisure_time > 0 &
    happiness > 0 & health > 0 & satisfaction > 0 & freedom > 0 &
    creativity > 0 & money > 0 & security > 0 & goodtime > 0 &
    help > 0 & success > 0 & proper > 0 & environment > 0 & tradition > 0
)
data
# Number of males vs females (1 = males; 2 = females).
table(data$sex)
1 2
71689 77937
# Categorical age variable: everything below 20 is "15-19", then one
# category per decade, and 80 or older is "80+".
age_breaks <- c(-Inf, seq(20, 80, by = 10), Inf)
age_labels <- c(
  "15-19", "20-29", "30-39", "40-49",
  "50-59", "60-69", "70-79", "80+"
)
# right = FALSE gives left-closed intervals ([20, 30), [30, 40), ...), i.e.
# the same ">= lower & < upper" rule as an explicit comparison. The result
# is stored as character, not as a factor.
data$agecat <- as.character(
  cut(
    data$age,
    breaks = age_breaks,
    labels = age_labels,
    right = FALSE,
    include.lowest = TRUE
  )
)
# Gender variable: replace the numeric codes by labels (1 = male, 2 = female).
# The first replacement turns the column into character; the comparison with
# 2 still works because R coerces the number for the comparison.
data$sex <- replace(data$sex, data$sex == 1, "male")
data$sex <- replace(data$sex, data$sex == 2, "female")
# Average age of participants.
mean(data$age)
[1] 41.59569
# Age range: youngest and oldest participant.
c(min(data$age), max(data$age))
[1] 15 99
# Risk taking frequency.
library(ggplot2)
ggplot(data, aes(x = risk)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(x = "Risk Taking", y = "Frequency", title = "Histogram of Risk Taking") +
  theme_minimal()
# Age frequency.
# Age takes whole-year values only, so one bin per year is used; a bin width
# of 0.5 left every second bin empty and produced a comb-shaped histogram.
# The title typo ("Distributionn") is fixed as well.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()
# Age vs risk taking: one box per age category.
ggplot(data, aes(x = agecat, y = risk)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Risk and Adventure by Age",
    x = "Age",
    y = "Risk and Adventure"
  ) +
  theme_minimal()
NA
NA
# Sex vs risk taking: one box per sex category.
ggplot(data, aes(x = as.factor(sex), y = risk)) +
  geom_boxplot()
# Column-wise overview of the data (including NA counts where present).
summary(data)
sex age country wave family_important friends_important leisure_time happiness health satisfaction
Length:149626 Min. :15.0 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:28.0 1st Qu.:276.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000
Mode :character Median :39.0 Median :484.0 Median :6.000 Median : 1.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 7.000
Mean :41.6 Mean :481.5 Mean :5.552 Mean : 1.094 Mean : 1.661 Mean : 1.871 Mean : 1.865 Mean : 2.106 Mean : 6.755
3rd Qu.:53.0 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000
Max. :99.0 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 5.000 Max. :10.000
NA's :221 NA's :351 NA's :698 NA's :573 NA's :230 NA's :340
freedom marital_status children creativity money security goodtime help success risk
Min. :-5.000 Min. :1.000 Min. :0.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :-5.000 Min. :1.000
1st Qu.: 6.000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.: 2.000 1st Qu.:3.000
Median : 7.000 Median :1.000 Median :2.000 Median : 3.000 Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.00 Median : 3.000 Median :4.000
Mean : 7.004 Mean :2.715 Mean :1.843 Mean : 2.718 Mean : 3.846 Mean : 2.374 Mean : 3.273 Mean : 2.29 Mean : 2.951 Mean :3.801
3rd Qu.: 9.000 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.00 3rd Qu.: 4.000 3rd Qu.:5.000
Max. :10.000 Max. :6.000 Max. :8.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.00 Max. : 6.000 Max. :6.000
NA's :838 NA's :972 NA's :602 NA's :442 NA's :566 NA's :44862 NA's :703
proper environment tradition employment education country_lab agecat
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000 Min. :1.000 Length:149626 Length:149626
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character
Median : 2.000 Median : 2.000 Median : 2.000 Median :3.000 Median :5.000 Mode :character Mode :character
Mean : 2.533 Mean : 2.468 Mean : 2.511 Mean :3.406 Mean :5.501
3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :8.000 Max. :9.000
NA's :541 NA's :561 NA's :518
# Data cleaning: drop every row that has an NA in at least one column,
# then show the column-wise overview again.
data <- na.omit(data)
summary(data)
sex age country wave family_important friends_important leisure_time
Length:101172 Min. :15.00 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:27.00 1st Qu.:268.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
Mode :character Median :39.00 Median :458.0 Median :5.000 Median : 1.000 Median : 2.000 Median : 2.000
Mean :41.11 Mean :474.4 Mean :5.348 Mean : 1.099 Mean : 1.652 Mean : 1.893
3rd Qu.:53.00 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000
Max. :99.00 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000
happiness health satisfaction freedom marital_status children creativity
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :1.000 Min. :0.000 Min. :-5.000
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000 1st Qu.: 5.00 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000
Median : 2.000 Median : 2.000 Median : 7.000 Median : 7.00 Median :1.000 Median :2.000 Median : 2.000
Mean : 1.889 Mean : 2.098 Mean : 6.692 Mean : 6.91 Mean :2.769 Mean :1.835 Mean : 2.699
3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000 3rd Qu.: 9.00 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000
Max. : 4.000 Max. : 5.000 Max. :10.000 Max. :10.00 Max. :6.000 Max. :8.000 Max. : 6.000
money security goodtime help success risk proper
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000 Min. :-5.000
1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.:3.000 1st Qu.: 1.000
Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.000 Median : 3.000 Median :4.000 Median : 2.000
Mean : 3.842 Mean : 2.363 Mean : 3.243 Mean : 2.281 Mean : 2.937 Mean :3.827 Mean : 2.538
3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.: 3.000
Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :6.000 Max. : 6.000
environment tradition employment education country_lab agecat education_cat
Min. :-5.000 Min. :-5.00 Min. :1.000 Min. :1.000 Length:101172 Length:101172 Length:101172
1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character Class :character
Median : 2.000 Median : 2.00 Median :3.000 Median :5.000 Mode :character Mode :character Mode :character
Mean : 2.452 Mean : 2.51 Mean :3.467 Mean :5.309
3rd Qu.: 3.000 3rd Qu.: 3.00 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.00 Max. :8.000 Max. :9.000
# Risk vs education.
# Risk is the outcome of the regression below, so it goes on the y-axis:
# geom_smooth(method = "lm") fits y ~ x, and with risk on the x-axis the
# plotted line was the fit of education on risk, not the reported model.
ggplot(data, aes(x = education, y = risk)) +
  geom_point() +
  geom_smooth(method = "lm")
# Linear regression of risk on education.
model <- lm(risk ~ education, data = data)
summary(model)
Call:
lm(formula = risk ~ education, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.0532 -1.0532 0.1564 1.2612 2.3660
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.10560 0.01183 347.08 <2e-16 ***
education -0.05240 0.00202 -25.95 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.589 on 101170 degrees of freedom
Multiple R-squared: 0.00661, Adjusted R-squared: 0.0066
F-statistic: 673.1 on 1 and 101170 DF, p-value: < 2.2e-16
# Risk vs freedom.
# Risk is the outcome of the regression below, so it goes on the y-axis:
# geom_smooth(method = "lm") fits y ~ x, and with risk on the x-axis the
# plotted line was the fit of freedom on risk, not the reported model.
ggplot(data, aes(x = freedom, y = risk)) +
  geom_point() +
  geom_smooth(method = "lm")
# Linear regression of risk on freedom.
model1 <- lm(risk ~ freedom, data = data)
summary(model1)
Call:
lm(formula = risk ~ freedom, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.3968 -1.1100 0.1769 1.2247 2.3204
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.157773 0.014987 277.43 <2e-16 ***
freedom -0.047814 0.002045 -23.38 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.59 on 101170 degrees of freedom
Multiple R-squared: 0.005375, Adjusted R-squared: 0.005365
F-statistic: 546.7 on 1 and 101170 DF, p-value: < 2.2e-16
# Risk taking by wave: one box per wave.
ggplot(data, aes(as.factor(wave), risk)) +
  geom_boxplot()
# Risk vs age scatter plot with a linear fit.
ggplot(data, aes(risk, age)) +
  geom_point() +
  geom_smooth(method = "lm")
# Education categories built from the numeric education code:
#   1-2 -> "incomplete or no primary education", 3-6 -> "no uni", 7+ -> "uni".
# The column is referenced explicitly as data$education instead of using
# attach()/detach(): attach() exposes a copy of the columns on the search
# path, which can be masked by other objects and stays attached if the
# script stops before detach() is reached.
data$education_cat[data$education < 3] <- "incomplete or no primary education"
data$education_cat[data$education > 2 & data$education <= 6] <- "no uni"
data$education_cat[data$education >= 7] <- "uni"
# Frequencies of the original education codes.
table(data$education)
1 2 3 4 5 6 7 8 9
9751 9603 18657 11323 29208 10845 24715 10923 24601
data
# Replace the numeric wave codes by readable labels.
# The first assignment turns the column into character; the comparison with
# 6 still works because R coerces the number for the comparison.
data$wave[data$wave == 5] <- "Wave 5"
# Fixed: this line used to assign to data$sex, which overwrote the sex of
# every Wave 6 respondent with "Wave 6" and left the wave code 6 unlabelled.
data$wave[data$wave == 6] <- "Wave 6"
data
# Check the unique responses of each variable with frequencies.
for (col_name in names(data)) {
  response_table <- table(data[[col_name]])
  print(paste("Response frequencies for", col_name, ":"))
  print(response_table)
}
[1] "Response frequencies for sex :"
female male
52327 48845
[1] "Response frequencies for age :"
15 16 17 18 19 20 21 22 23 24 25
56 374 528 2379 2234 2592 2274 2519 2573 2492 2615
26 27 28 29 30 31 32 33 34 35 36
2365 2481 2415 2210 2738 1998 2264 2047 2049 2540 2090
37 38 39 40 41 42 43 44 45 46 47
1993 2146 1931 2535 1720 2184 1899 1793 2132 1667 1637
48 49 50 51 52 53 54 55 56 57 58
1668 1592 2008 1415 1592 1400 1405 1595 1363 1342 1217
59 60 61 62 63 64 65 66 67 68 69
1037 1453 989 1138 1046 899 1164 885 791 810 652
70 71 72 73 74 75 76 77 78 79 80
924 554 624 539 478 488 434 372 312 230 263
81 82 83 84 85 86 87 88 89 90 91
193 180 149 129 125 65 45 29 29 29 12
92 93 94 95 97 98 99
8 6 10 5 4 3 2
[1] "Response frequencies for country :"
12 20 32 36 76 100 124 152 156 158 170
934 1001 978 1336 2867 935 2094 963 3664 1225 1458
196 218 231 246 250 268 275 276 288 332 348
1029 1196 1479 1004 993 2600 918 2931 1513 1868 999
356 360 364 368 392 400 410 414 422 434 458
1320 1922 2545 1093 2598 1178 1199 985 1000 1781 1197
466 484 498 504 528 578 586 604 616 642 643
1187 1500 1028 1803 2566 1015 1108 1421 986 1530 1907
646 702 704 705 710 724 752 756 764 780 788
2573 1923 1406 1001 6241 2216 985 1219 2548 995 1024
792 804 818 826 854 858 887 894
1302 939 4549 1006 1250 983 739 1419
[1] "Response frequencies for wave :"
5 6
65923 35249
[1] "Response frequencies for family_important :"
-5 -2 -1 1 2 3 4
33 89 65 91729 8132 852 272
[1] "Response frequencies for friends_important :"
-5 -2 -1 1 2 3 4
26 178 151 47663 41137 10256 1761
[1] "Response frequencies for leisure_time :"
-5 -2 -1 1 2 3 4
56 347 314 34842 43129 18251 4233
[1] "Response frequencies for happiness :"
-5 -2 -1 1 2 3 4
17 177 434 29505 53848 13997 3194
[1] "Response frequencies for health :"
-5 -2 -1 1 2 3 4 5
5 148 81 25633 45401 23491 6359 54
[1] "Response frequencies for satisfaction :"
-5 -2 -1 1 2 3 4 5 6
25 340 252 3324 2249 4386 5420 13413 11842
7 8 9 10
16887 20550 10143 12341
[1] "Response frequencies for freedom :"
-5 -2 -1 1 2 3 4 5 6
45 335 873 2981 1639 3126 4564 12576 11942
7 8 9 10
16907 18755 10284 17145
[1] "Response frequencies for marital_status :"
1 2 3 4 5 6
55216 7851 3296 1875 5813 27121
[1] "Response frequencies for children :"
0 1 2 3 4 5 6 7 8
31045 16203 24841 13858 6859 3656 2047 1146 1517
[1] "Response frequencies for creativity :"
-5 -2 -1 1 2 3 4 5 6
15 193 655 20992 29096 23097 13553 9933 3638
[1] "Response frequencies for money :"
-5 -2 -1 1 2 3 4 5 6
16 148 302 8513 14253 17681 17200 27821 15238
[1] "Response frequencies for security :"
-5 -2 -1 1 2 3 4 5 6
10 183 276 30142 31845 19335 11079 6237 2065
[1] "Response frequencies for goodtime :"
-5 -2 -1 1 2 3 4 5 6
19 252 275 13965 22171 21433 17525 16879 8653
[1] "Response frequencies for help :"
-5 -2 -1 1 2 3 4 5 6
11 185 183 27950 35868 21809 10599 3503 1064
[1] "Response frequencies for success :"
-5 -2 -1 1 2 3 4 5 6
16 239 389 17947 25555 22555 16029 13450 4992
[1] "Response frequencies for risk :"
1 2 3 4 5 6
9808 15160 17160 16901 24844 17299
[1] "Response frequencies for proper :"
-5 -2 -1 1 2 3 4 5 6
11 175 296 25321 31522 20186 12496 8388 2777
[1] "Response frequencies for environment :"
-5 -2 -1 1 2 3 4 5 6
24 157 459 24324 33665 22423 12704 5218 2198
[1] "Response frequencies for tradition :"
-5 -2 -1 1 2 3 4 5 6
18 155 248 29740 28611 18479 11569 8246 4106
[1] "Response frequencies for employment :"
1 2 3 4 5 6 7 8
31967 8190 13340 11846 15128 7656 10934 2111
[1] "Response frequencies for education :"
1 2 3 4 5 6 7 8 9
7780 7420 14060 7904 18308 7321 16517 7162 14700
[1] "Response frequencies for country_lab :"
Algeria Andorra
934 1001
Argentina Australia
978 1336
Brazil Bulgaria
2867 935
Burkina Faso Canada
1250 2094
Chile China
963 3664
Colombia Cyprus (G)
1458 1029
Ecuador Egypt
1196 4549
Ethiopia Finland
1479 1004
France Georgia
993 2600
Germany Ghana
2931 1513
Great Britain Haiti
1006 1868
Hungary India
999 1320
Indonesia Iran
1922 2545
Iraq Japan
1093 2598
Jordan Kuwait
1178 985
Lebanon Libya
1000 1781
Malaysia Mali
1197 1187
Mexico Moldova
1500 1028
Morocco Netherlands
1803 2566
Norway Pakistan
1015 1108
Palestine Peru
918 1421
Poland Romania
986 1530
Russia Rwanda
1907 2573
Singapore Slovenia
1923 1001
South Africa South Korea
6241 1199
Spain Sweden
2216 985
Switzerland Taiwan
1219 1225
Thailand Trinidad and Tobago
2548 995
Tunisia Turkey
1024 1302
Ukraine Uruguay
939 983
Viet Nam Yemen
1406 739
Zambia
1419
[1] "Response frequencies for agecat :"
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
5571 24536 21796 18827 14374 9827 4955 1286
[1] "Response frequencies for education_cat :"
incomplete or no primary education
15200
no uni
47593
uni
38379
```